{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "de6537c4",
   "metadata": {},
   "source": [
    "# BatchEvalRunner - Running Multiple Evaluations\n",
    "\n",
    "The `BatchEvalRunner` class can be used to run a series of evaluations asynchronously. The async jobs are limited to a defined size of `num_workers`.\n",
    "\n",
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a8304f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# attach to the same event-loop\n",
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff59a851",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import openai\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
    "# openai.api_key = os.environ[\"OPENAI_API_KEY\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import (\n",
    "    VectorStoreIndex,\n",
    "    SimpleDirectoryReader,\n",
    "    ServiceContext,\n",
    "    Response,\n",
    ")\n",
    "from llama_index.llms import OpenAI\n",
    "from llama_index.evaluation import (\n",
    "    FaithfulnessEvaluator,\n",
    "    RelevancyEvaluator,\n",
    "    CorrectnessEvaluator,\n",
    ")\n",
    "import pandas as pd\n",
    "\n",
    "pd.set_option(\"display.max_colwidth\", 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "efe66f2a",
   "metadata": {},
   "source": [
    "Using GPT-4 here for evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9b98f89-d5b8-4d29-92f6-ad76d5060e9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# gpt-4\n",
    "gpt4 = OpenAI(temperature=0, model=\"gpt-4\")\n",
    "service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4)\n",
    "\n",
    "faithfulness_gpt4 = FaithfulnessEvaluator(service_context=service_context_gpt4)\n",
    "relevancy_gpt4 = RelevancyEvaluator(service_context=service_context_gpt4)\n",
    "correctness_gpt4 = CorrectnessEvaluator(service_context=service_context_gpt4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1298bbb4-c99e-431e-93ef-eb32c0a2fc2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = SimpleDirectoryReader(\"./test_wiki_data/\").load_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41f0e53f-77a6-40d5-94ae-3f81b01af75c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create vector index\n",
    "llm = OpenAI(temperature=0.3, model=\"gpt-3.5-turbo\")\n",
    "service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)\n",
    "vector_index = VectorStoreIndex.from_documents(\n",
    "    documents, service_context=service_context\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "400f486d",
   "metadata": {},
   "source": [
    "## Question Generation\n",
    "\n",
    "To run evaluations in batch, you can create the runner and then call the `.aevaluate_queries()` function on a list of queries.\n",
    "\n",
    "First, we can generate some questions and then run evaluation on them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "976e0a93",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: spacy in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (3.7.2)\n",
      "Requirement already satisfied: datasets in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (2.15.0)\n",
      "Requirement already satisfied: span-marker in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (1.5.0)\n",
      "Requirement already satisfied: scikit-learn in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (1.3.2)\n",
      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (3.0.12)\n",
      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (1.0.5)\n",
      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (1.0.10)\n",
      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (2.0.8)\n",
      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (3.0.9)\n",
      "Requirement already satisfied: thinc<8.3.0,>=8.1.8 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (8.2.1)\n",
      "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (1.1.2)\n",
      "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (2.4.8)\n",
      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (2.0.10)\n",
      "Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (0.3.4)\n",
      "Requirement already satisfied: typer<0.10.0,>=0.3.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (0.9.0)\n",
      "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (6.4.0)\n",
      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (4.66.1)\n",
      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (2.31.0)\n",
      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (1.10.12)\n",
      "Requirement already satisfied: jinja2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (3.1.2)\n",
      "Requirement already satisfied: setuptools in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (69.0.2)\n",
      "Requirement already satisfied: packaging>=20.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (23.2)\n",
      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (3.3.0)\n",
      "Requirement already satisfied: numpy>=1.19.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (1.24.4)\n",
      "Requirement already satisfied: pyarrow>=8.0.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (14.0.1)\n",
      "Requirement already satisfied: pyarrow-hotfix in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (0.6)\n",
      "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (0.3.7)\n",
      "Requirement already satisfied: pandas in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (2.0.3)\n",
      "Requirement already satisfied: xxhash in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (3.4.1)\n",
      "Requirement already satisfied: multiprocess in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (0.70.15)\n",
      "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (2023.10.0)\n",
      "Requirement already satisfied: aiohttp in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (3.9.1)\n",
      "Requirement already satisfied: huggingface-hub>=0.18.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (0.19.4)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (6.0.1)\n",
      "Requirement already satisfied: torch in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from span-marker) (2.1.1)\n",
      "Requirement already satisfied: accelerate in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from span-marker) (0.25.0)\n",
      "Requirement already satisfied: transformers>=4.19.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from span-marker) (4.35.2)\n",
      "Requirement already satisfied: evaluate in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from span-marker) (0.4.1)\n",
      "Requirement already satisfied: seqeval in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from span-marker) (1.2.2)\n",
      "Requirement already satisfied: scipy>=1.5.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from scikit-learn) (1.11.4)\n",
      "Requirement already satisfied: joblib>=1.1.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from scikit-learn) (1.3.2)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from scikit-learn) (3.2.0)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from aiohttp->datasets) (23.1.0)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from aiohttp->datasets) (6.0.4)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from aiohttp->datasets) (1.9.3)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from aiohttp->datasets) (1.4.0)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from aiohttp->datasets) (1.3.1)\n",
      "Requirement already satisfied: filelock in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from huggingface-hub>=0.18.0->datasets) (3.13.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from huggingface-hub>=0.18.0->datasets) (4.8.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.6)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2023.11.17)\n",
      "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from thinc<8.3.0,>=8.1.8->spacy) (0.7.11)\n",
      "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from thinc<8.3.0,>=8.1.8->spacy) (0.1.4)\n",
      "Requirement already satisfied: regex!=2019.12.17 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from transformers>=4.19.0->span-marker) (2023.10.3)\n",
      "Requirement already satisfied: tokenizers<0.19,>=0.14 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from transformers>=4.19.0->span-marker) (0.15.0)\n",
      "Requirement already satisfied: safetensors>=0.3.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from transformers>=4.19.0->span-marker) (0.4.1)\n",
      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from typer<0.10.0,>=0.3.0->spacy) (8.1.7)\n",
      "Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from weasel<0.4.0,>=0.1.0->spacy) (0.16.0)\n",
      "Requirement already satisfied: psutil in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from accelerate->span-marker) (5.9.6)\n",
      "Requirement already satisfied: sympy in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (1.12)\n",
      "Requirement already satisfied: networkx in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (3.2.1)\n",
      "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (12.1.105)\n",
      "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (12.1.105)\n",
      "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (12.1.105)\n",
      "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (8.9.2.26)\n",
      "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (12.1.3.1)\n",
      "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (11.0.2.54)\n",
      "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (10.3.2.106)\n",
      "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (11.4.5.107)\n",
      "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (12.1.0.106)\n",
      "Requirement already satisfied: nvidia-nccl-cu12==2.18.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (2.18.1)\n",
      "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (12.1.105)\n",
      "Requirement already satisfied: triton==2.1.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (2.1.0)\n",
      "Requirement already satisfied: nvidia-nvjitlink-cu12 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch->span-marker) (12.3.101)\n",
      "Requirement already satisfied: responses<0.19 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from evaluate->span-marker) (0.18.0)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from jinja2->spacy) (2.1.3)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from pandas->datasets) (2.8.2)\n",
      "Requirement already satisfied: pytz>=2020.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from pandas->datasets) (2023.3.post1)\n",
      "Requirement already satisfied: tzdata>=2022.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from pandas->datasets) (2023.3)\n",
      "Requirement already satisfied: six>=1.5 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
      "Requirement already satisfied: mpmath>=0.19 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from sympy->torch->span-marker) (1.3.0)\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.1\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install spacy datasets span-marker scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e31e10e6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/loganm/llama_index_proper/llama_index/llama_index/evaluation/dataset_generation.py:187: DeprecationWarning: Call to deprecated class DatasetGenerator. (Deprecated in favor of `RagDatasetGenerator` which should be used instead.)\n",
      "  return cls(\n",
      "/home/loganm/llama_index_proper/llama_index/llama_index/evaluation/dataset_generation.py:282: DeprecationWarning: Call to deprecated class QueryResponseDataset. (Deprecated in favor of `LabelledRagDataset` which should be used instead.)\n",
      "  return QueryResponseDataset(queries=queries, responses=responses_dict)\n"
     ]
    }
   ],
   "source": [
    "from llama_index.evaluation import DatasetGenerator\n",
    "\n",
    "dataset_generator = DatasetGenerator.from_documents(\n",
    "    documents, service_context=service_context\n",
    ")\n",
    "\n",
    "qas = dataset_generator.generate_dataset_from_nodes(num=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f3d6861e",
   "metadata": {},
   "source": [
    "## Running Batch Evaluation\n",
    "\n",
    "Now, we can run our batch evaluation!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "180a5d2e-9286-477b-9cd0-a5976d18d845",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.evaluation import BatchEvalRunner\n",
    "\n",
    "runner = BatchEvalRunner(\n",
    "    {\"faithfulness\": faithfulness_gpt4, \"relevancy\": relevancy_gpt4},\n",
    "    workers=8,\n",
    ")\n",
    "\n",
    "eval_results = await runner.aevaluate_queries(\n",
    "    vector_index.as_query_engine(), queries=qas.questions\n",
    ")\n",
    "\n",
    "# If we had ground-truth answers, we could also include the correctness evaluator like below.\n",
    "# The correctness evaluator depends on additional kwargs, which are passed in as a dictionary.\n",
    "# Each question is mapped to a set of kwargs\n",
    "#\n",
    "\n",
    "# runner = BatchEvalRunner(\n",
    "#     {\"correctness\": correctness_gpt4},\n",
    "#     workers=8,\n",
    "# )\n",
    "\n",
    "# eval_results = await runner.aevaluate_queries(\n",
    "#     vector_index.as_query_engine(),\n",
    "#     queries=qas.queries,\n",
    "#     reference=[qr[1] for qr in qas.qr_pairs],\n",
    "# )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0eff6823",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3\n"
     ]
    }
   ],
   "source": [
    "print(len([qr for qr in qas.qr_pairs]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b256b98c",
   "metadata": {},
   "source": [
    "## Inspecting Outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4ab78d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['correctness'])\n",
      "dict_keys(['query', 'contexts', 'response', 'passing', 'feedback', 'score', 'pairwise_source'])\n",
      "False\n",
      "The context information does not provide any information related to the query. Therefore, I cannot provide an answer based on the given context.\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "print(eval_results.keys())\n",
    "\n",
    "print(eval_results[\"correctness\"][0].dict().keys())\n",
    "\n",
    "print(eval_results[\"correctness\"][0].passing)\n",
    "print(eval_results[\"correctness\"][0].response)\n",
    "print(eval_results[\"correctness\"][0].contexts)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8b20df9",
   "metadata": {},
   "source": [
    "## Reporting Total Scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35fdbe0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_eval_results(key, eval_results):\n",
    "    results = eval_results[key]\n",
    "    correct = 0\n",
    "    for result in results:\n",
    "        if result.passing:\n",
    "            correct += 1\n",
    "    score = correct / len(results)\n",
    "    print(f\"{key} Score: {score}\")\n",
    "    return score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2435e398",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "correctness Score: 0.0\n"
     ]
    }
   ],
   "source": [
    "score = get_eval_results(\"correctness\", eval_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f57c938",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "relevancy Score: 0.96\n"
     ]
    }
   ],
   "source": [
    "score = get_eval_results(\"relevancy\", eval_results)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama-index-4a-wkI5X-py3.11",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
